@@ -23,14 +23,16 @@ module Agents |
||
| 23 | 23 |
|
| 24 | 24 |
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes. |
| 25 | 25 |
|
| 26 |
- When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab. An example: |
|
| 26 |
+ When parsing HTML or XML, these sub-hashes specify how each extraction should be done. The Agent first selects a node set from the document for each extraction key by evaluating either a CSS selector in `css` or an XPath expression in `xpath`. It then evaluates an XPath expression in `value` on each node in the node set, converting the result into a string. Here's an example: |
|
| 27 | 27 |
|
| 28 | 28 |
"extract": {
|
| 29 |
- "url": { "css": "#comic img", "attr": "src" },
|
|
| 30 |
- "title": { "css": "#comic img", "attr": "title" },
|
|
| 31 |
- "body_text": { "css": "div.main", "text": true }
|
|
| 29 |
+ "url": { "css": "#comic img", "value": "@src" },
|
|
| 30 |
+ "title": { "css": "#comic img", "value": "@title" },
|
|
| 31 |
+ "body_text": { "css": "div.main", "value": ".//text()" }
|
|
| 32 | 32 |
} |
| 33 | 33 |
|
| 34 |
+ "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" extracts all the enclosed text. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc. Note that these functions take a string, not a node set (a node set argument is converted to a string using only its first node), so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`. |
|
| 35 |
+ |
|
| 34 | 36 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
| 35 | 37 |
|
| 36 | 38 |
"extract": {
|
@@ -70,9 +72,9 @@ module Agents |
||
| 70 | 72 |
'type' => "html", |
| 71 | 73 |
'mode' => "on_change", |
| 72 | 74 |
'extract' => {
|
| 73 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 74 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" },
|
|
| 75 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
|
|
| 75 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 76 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" },
|
|
| 77 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
|
|
| 76 | 78 |
} |
| 77 | 79 |
} |
| 78 | 80 |
end |
@@ -152,20 +154,21 @@ module Agents |
||
| 152 | 154 |
error '"css" or "xpath" is required for HTML or XML extraction' |
| 153 | 155 |
return |
| 154 | 156 |
end |
| 155 |
- unless Nokogiri::XML::NodeSet === nodes |
|
| 157 |
+ case nodes |
|
| 158 |
+ when Nokogiri::XML::NodeSet |
|
| 159 |
+ result = nodes.map { |node|
|
|
| 160 |
+ case value = node.xpath(extraction_details['value']) |
|
| 161 |
+ when Float |
|
| 162 |
+ # Node#xpath() returns any numeric value as float; |
|
| 163 |
+ # convert it to integer as appropriate. |
|
| 164 |
+ value = value.to_i if value.to_i == value |
|
| 165 |
+ end |
|
| 166 |
+ value.to_s |
|
| 167 |
+ } |
|
| 168 |
+ else |
|
| 156 | 169 |
error "The result of HTML/XML extraction was not a NodeSet" |
| 157 | 170 |
return |
| 158 | 171 |
end |
| 159 |
- result = nodes.map { |node|
|
|
| 160 |
- if extraction_details['attr'] |
|
| 161 |
- node.attr(extraction_details['attr']) |
|
| 162 |
- elsif extraction_details['text'] |
|
| 163 |
- node.text() |
|
| 164 |
- else |
|
| 165 |
- error '"attr" or "text" is required on HTML or XML extraction patterns' |
|
| 166 |
- return |
|
| 167 |
- end |
|
| 168 |
- } |
|
| 169 | 172 |
log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
|
| 170 | 173 |
end |
| 171 | 174 |
output[name] = result |
@@ -0,0 +1,30 @@ |
||
| 1 |
# Rewrites the legacy `attr` / `text` keys in the `extract` option of
# Agents::WebsiteAgent records into the XPath-based `value` key:
#
#   { 'css' => '#comic img', 'attr' => 'src' } => { 'css' => '#comic img', 'value' => '@src' }
#   { 'css' => 'div.main',   'text' => true }  => { 'css' => 'div.main',   'value' => './/text()' }
class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
  # Migration-local model; `json_serialize` is provided by JSONSerializedField,
  # which is defined elsewhere in the project.
  class Agent < ActiveRecord::Base
    include JSONSerializedField
    json_serialize :options
  end

  def up
    # find_each loads the agents in batches instead of all at once.
    Agent.where(type: 'Agents::WebsiteAgent').find_each do |agent|
      # Only touch agents whose extractions all select nodes via `css` or
      # `xpath`; any other shape of `extract` is left exactly as it is.
      next unless node_extraction?(agent.options['extract'])

      agent.options_will_change!
      agent.options['extract'].each_value { |extraction| adopt_value!(extraction) }
      agent.save!
    end
  end

  def down
    raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration"
  end

  private

  # True when `extract` is a Hash whose every entry is itself a Hash that
  # carries an `xpath` or `css` key. The Hash check on each entry keeps a
  # malformed entry from raising NoMethodError and aborting the migration.
  def node_extraction?(extract)
    extract.is_a?(Hash) && extract.values.all? { |detail|
      detail.is_a?(Hash) && (detail.key?('xpath') || detail.key?('css'))
    }
  end

  # Replaces the legacy keys of one extraction hash in place.
  #
  # Both legacy keys are always removed. `attr` takes precedence over `text`
  # because the extraction code this migration supersedes checked `attr`
  # first; converting in the same order keeps each agent's output unchanged.
  def adopt_value!(extraction)
    attr = extraction.delete('attr')
    text = extraction.delete('text')

    if attr
      extraction['value'] = "@#{attr}"
    elsif text
      extraction['value'] = './/text()'
    end
  end
end
@@ -10,8 +10,8 @@ jane_website_agent: |
||
| 10 | 10 |
:expected_update_period_in_days => 2, |
| 11 | 11 |
:mode => :on_change, |
| 12 | 12 |
:extract => {
|
| 13 |
- :title => {:css => "item title", :text => true},
|
|
| 14 |
- :url => {:css => "item link", :text => true}
|
|
| 13 |
+ :title => {:css => "item title", :value => './/text()'},
|
|
| 14 |
+ :url => {:css => "item link", :value => './/text()'}
|
|
| 15 | 15 |
} |
| 16 | 16 |
}.to_json.inspect %> |
| 17 | 17 |
|
@@ -27,8 +27,8 @@ bob_website_agent: |
||
| 27 | 27 |
:expected_update_period_in_days => 2, |
| 28 | 28 |
:mode => :on_change, |
| 29 | 29 |
:extract => {
|
| 30 |
- :url => {:css => "#comic img", :attr => "src"},
|
|
| 31 |
- :title => {:css => "#comic img", :attr => "title"}
|
|
| 30 |
+ :url => {:css => "#comic img", :value => "@src"},
|
|
| 31 |
+ :title => {:css => "#comic img", :value => "@title"}
|
|
| 32 | 32 |
} |
| 33 | 33 |
}.to_json.inspect %> |
| 34 | 34 |
|
@@ -768,8 +768,8 @@ describe AgentDrop do |
||
| 768 | 768 |
url: 'http://dilbert.com/', |
| 769 | 769 |
mode: 'on_change', |
| 770 | 770 |
extract: {
|
| 771 |
- url: { css: '[id^=strip_enlarged_] img', attr: 'src' },
|
|
| 772 |
- title: { css: '.STR_DateStrip', text: true },
|
|
| 771 |
+ url: { css: '[id^=strip_enlarged_] img', value: '@src' },
|
|
| 772 |
+ title: { css: '.STR_DateStrip', value: './/text()' },
|
|
| 773 | 773 |
}, |
| 774 | 774 |
}, |
| 775 | 775 |
schedule: 'every_12h', |
@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do |
||
| 11 | 11 |
'url' => "http://xkcd.com", |
| 12 | 12 |
'mode' => 'on_change', |
| 13 | 13 |
'extract' => {
|
| 14 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 15 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" },
|
|
| 16 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
|
|
| 14 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 15 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" },
|
|
| 16 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
|
|
| 17 | 17 |
} |
| 18 | 18 |
} |
| 19 | 19 |
@checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2) |
@@ -256,8 +256,7 @@ describe Agents::WebsiteAgent do |
||
| 256 | 256 |
'url' => "http://xkcd.com", |
| 257 | 257 |
'mode' => "on_change", |
| 258 | 258 |
'extract' => {
|
| 259 |
- 'url' => {'css' => "#topLeft a", 'attr' => "href"},
|
|
| 260 |
- 'title' => {'css' => "#topLeft a", 'text' => "true"}
|
|
| 259 |
+ 'url' => {'css' => "#topLeft a", 'value' => "@href"},
|
|
| 261 | 260 |
} |
| 262 | 261 |
} |
| 263 | 262 |
rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
@@ -268,6 +267,44 @@ describe Agents::WebsiteAgent do |
||
| 268 | 267 |
event.payload['url'].should == "http://xkcd.com/about" |
| 269 | 268 |
end |
| 270 | 269 |
|
| 270 |
it "should return an integer value if XPath evaluates to one" do
  # count() produces a numeric XPath result; the payload is expected to
  # carry it as the string "9".
  options = {
    'name' => "XKCD",
    'expected_update_period_in_days' => 2,
    'type' => "html",
    'url' => "http://xkcd.com",
    'mode' => "on_change",
    'extract' => {
      'num_links' => { 'css' => "#comicLinks", 'value' => "count(./a)" }
    }
  }

  agent = Agents::WebsiteAgent.new(:name => "xkcd", :options => options)
  agent.user = users(:bob)
  agent.save!
  agent.check

  Event.last.payload['num_links'].should == "9"
end
|
| 288 |
+ |
|
| 289 |
it "should return all texts concatenated if XPath returns many text nodes" do
  # ".//text()" is evaluated against the "#slogan" node; the payload is
  # expected to hold the full slogan as a single string.
  extract = { 'slogan' => { 'css' => "#slogan", 'value' => ".//text()" } }
  options = {
    'name' => "XKCD",
    'expected_update_period_in_days' => 2,
    'type' => "html",
    'url' => "http://xkcd.com",
    'mode' => "on_change",
    'extract' => extract
  }

  agent = Agents::WebsiteAgent.new(:name => "xkcd", :options => options)
  agent.user = users(:bob)
  agent.save!
  agent.check

  slogan = Event.last.payload['slogan']
  slogan.should == "A webcomic of romance, sarcasm, math, and language."
end
|
| 307 |
+ |
|
| 271 | 308 |
describe "JSON" do |
| 272 | 309 |
it "works with paths" do |
| 273 | 310 |
json = {
|
@@ -389,9 +426,9 @@ describe Agents::WebsiteAgent do |
||
| 389 | 426 |
'url' => "http://www.example.com", |
| 390 | 427 |
'mode' => 'on_change', |
| 391 | 428 |
'extract' => {
|
| 392 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 393 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" },
|
|
| 394 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
|
|
| 429 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 430 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" },
|
|
| 431 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
|
|
| 395 | 432 |
}, |
| 396 | 433 |
'basic_auth' => "user:pass" |
| 397 | 434 |
} |
@@ -421,7 +458,7 @@ describe Agents::WebsiteAgent do |
||
| 421 | 458 |
'mode' => 'on_change', |
| 422 | 459 |
'headers' => { 'foo' => 'bar' },
|
| 423 | 460 |
'extract' => {
|
| 424 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" },
|
|
| 461 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" },
|
|
| 425 | 462 |
} |
| 426 | 463 |
} |
| 427 | 464 |
@checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options) |